Análisis Exploratorio de Datos (EDA)
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go
from phik import phik_matrix
from numpy import log10, NaN
from seaborn import heatmap, set_theme
from json import load
from urllib.request import urlopen
from plotly.subplots import make_subplots
# Global plotting defaults for the notebook: seaborn theme first, then the
# base matplotlib font size. The font size is assigned after set_theme() so
# that 12 is the value left in rcParams.
set_theme(context="talk", style="ticks", palette="tab10")
plt.rcParams['font.size'] = 12
Archivo GeoJSON para trazar los mapas:
# Raw gist URL of the Colombia GeoJSON used as the base layer for the maps.
GEOJSON_URL = 'https://gist.githubusercontent.com/john-guerra/43c7656821069d00dcbc/raw/be6a6e239cd5b5b803c6e7c2ec405b793a9064dd/Colombia.geo.json'

# The timeout (in seconds) makes an unreachable host fail with an error
# instead of blocking the whole run indefinitely.
with urlopen(GEOJSON_URL, timeout=30) as response:
    # json.load parses the HTTP response body directly into Python objects.
    counties = load(response)
# Load the cleaned dataset from its parquet file.
df = pd.read_parquet('Data/data_cleaned.parquet')


def _to_codes(column):
    """Return the integer codes that pd.factorize assigns to *column*."""
    codes, _uniques = pd.factorize(column)
    return codes


# Replace every column by its factorized integer codes.
df_factorize = df.apply(_to_codes)

# Phi_K correlation matrix over the encoded frame; every column is declared
# as an interval column.
# NOTE(review): the codes follow order of first appearance, so treating them
# as interval variables may not be intended for categorical columns - confirm.
interval_columns = df.columns.tolist()
df_corr = df_factorize.phik_matrix(interval_cols=interval_columns).copy()
Como podemos ver en el mapa de calor, hay una relación entre las columnas de género y grupo_etario.
import plotly.express as px
# Heatmap of the correlation matrix; the same column labels are used on
# both axes because the matrix is column-by-column.
axis_labels = df_corr.columns
fig = px.imshow(
    df_corr.values,
    x=axis_labels,
    y=axis_labels,
    labels={"x": "", "y": "", "color": "Correlación"},
    color_continuous_scale='RdBu',
)

# Fixed figure size and title, applied after the trace is built.
heatmap_layout = {
    "title": "Mapa de calor de correlación",
    "width": 800,
    "height": 600,
}
fig.update_layout(**heatmap_layout)
fig.show()
Distribución de los datos:
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import pandas as pd
# One histogram per DataFrame column, laid out on a two-column grid.
column_names = df.columns
num_columns = len(column_names)
# Ceiling of num_columns / 2: an odd column count needs one extra row.
num_rows = (num_columns + 1) // 2

fig = make_subplots(rows=num_rows, cols=2, subplot_titles=column_names)

for i, column in enumerate(column_names):
    # Fill the grid left to right, top to bottom; plotly positions are 1-based.
    row_offset, col_offset = divmod(i, 2)
    histogram = go.Histogram(x=df[column], nbinsx=50, name=column)
    fig.add_trace(histogram, row=row_offset + 1, col=col_offset + 1)

# Total height grows with the number of rows (500 px per row).
fig.update_layout(
    title_text="Histograms of DataFrame Columns",
    width=800,
    height=500 * num_rows,
)
fig.show()